package com.iflytek.jzcpx.procuracy.card.common;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.lang3.StringUtils;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.util.ZipSecureFile;
import org.apache.poi.poifs.filesystem.FileMagic;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.xml.sax.SAXException;

/**
 * Static helpers for recognising Word documents and extracting their plain text.
 *
 * @author <a href=mailto:ktyi@iflytek.com>伊开堂</a>
 * @date 2019/8/21 9:39
 */
public final class FileUtils {

    /** Suffixes (without the dot) that {@link #isWordFile(String)} treats as Word documents. */
    private static final String[] WORD_EXTENSIONS = {"doc", "docx", "wps"};

    /** Utility class: only static members, never instantiated. */
    private FileUtils() {
    }

    /**
     * Extracts the plain-text content of a Word document.
     *
     * <p>The format is detected from the stream's leading magic bytes: OLE2 content is read
     * with {@link WordExtractor} (binary {@code .doc}), OOXML content with
     * {@link XWPFWordExtractor} ({@code .docx}). Any other content is not parsed.
     *
     * <p>The supplied stream is wrapped in a {@link BufferedInputStream} that is closed before
     * this method returns, so the caller's stream is closed as well.
     *
     * @param is input stream positioned at the start of the document
     *
     * @return the extracted text, or {@code null} when the content is neither OLE2 nor OOXML
     *
     * @throws IOException  if the stream cannot be read or the document cannot be parsed
     * @throws SAXException declared for compatibility with existing callers; nothing in this
     *                      method body throws it directly
     */
    public static String extractWordText(InputStream is) throws IOException, SAXException {
        try (BufferedInputStream bis = new BufferedInputStream(is)) {
            // FileMagic needs mark/reset support to peek at the header, hence the buffering.
            final FileMagic fileMagic = FileMagic.valueOf(bis);

            if (fileMagic == FileMagic.OLE2) {
                relaxZipBombCheck();
                // try-with-resources closes the extractor even when getText() fails.
                try (WordExtractor extractor = new WordExtractor(bis)) {
                    return extractor.getText();
                }
            }

            if (fileMagic == FileMagic.OOXML) {
                relaxZipBombCheck();
                // The extractor is the single owner of the document here; closing the
                // extractor is relied upon to release the document it wraps.
                try (XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(bis))) {
                    return extractor.getText();
                }
            }

            // Unsupported or unrecognised format.
            return null;
        }
    }

    /**
     * Lowers POI's minimum inflate ratio so that highly compressed documents are accepted.
     *
     * <p>NOTE(review): this is a static setter, so the value applies process-wide and stays in
     * effect after this call. According to the POI documentation the ratio is the threshold of
     * the zip-bomb guard; a negative value appears to switch that guard off entirely. Confirm
     * this is acceptable if documents can come from untrusted sources.
     */
    private static void relaxZipBombCheck() {
        ZipSecureFile.setMinInflateRatio(-1.0d);
    }

    /**
     * Tells whether a file type denotes a Word document (doc, docx, wps).
     *
     * <p>The check is a case-insensitive suffix match, so bare extensions ({@code "DOC"},
     * {@code "Docx"}), dotted extensions ({@code ".Doc"}, {@code ".WPS"}) and any longer string
     * ending in one of the suffixes (for example {@code "report.docx"}) are all accepted.
     * The comparison does not depend on the JVM's default locale.
     *
     * @param jzwjlx file suffix / file type; may be {@code null}
     *
     * @return {@code true} when {@code jzwjlx} ends with doc, docx or wps (ignoring case);
     *         {@code false} for {@code null}, empty or whitespace-only input
     */
    public static boolean isWordFile(String jzwjlx) {
        if (StringUtils.isBlank(jzwjlx)) {
            return false;
        }
        for (String extension : WORD_EXTENSIONS) {
            if (StringUtils.endsWithIgnoreCase(jzwjlx, extension)) {
                return true;
            }
        }
        return false;
    }
}
